import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

# Load the raw data. Each row carries two players (Player_1 / Player_2),
# their ranks and win/loss ratios, and the Surface.
# NOTE(review): presumably one row per match -- confirm against the CSV.
atp_tennis = pd.read_csv('atp_tennis.csv')

# Reshape to a long format with one row per (source row, player) so that both
# sides can be aggregated under a single 'Player' key. The frame is built
# directly by the concat below; no empty placeholder frame is needed.
long_columns = ['Player', 'Rank', 'Win_Loss_Ratio', 'Surface']

# Player_1 side of every row, renamed to the shared column names
player_data_1 = atp_tennis[['Player_1', 'Rank_1', 'Win_Loss_Ratio_1', 'Surface']].copy()
player_data_1.columns = long_columns

# Player_2 side of every row, renamed to the shared column names
player_data_2 = atp_tennis[['Player_2', 'Rank_2', 'Win_Loss_Ratio_2', 'Surface']].copy()
player_data_2.columns = long_columns

# Stack both sides; ignore_index gives the result a fresh 0..n-1 index
player_data = pd.concat([player_data_1, player_data_2], ignore_index=True)

# Per unique player: mean rank, mean win/loss ratio, and the number of rows
# with a non-null Surface (used as the player's match count)
unique_player_data = player_data.groupby('Player').agg({'Rank': 'mean', 'Win_Loss_Ratio': 'mean', 'Surface': 'count'}).reset_index()
unique_player_data.columns = ['Player', 'Avg_Rank', 'Avg_Win_Loss_Ratio', 'Match_Count']

# Bucket players by average rank. pd.cut intervals are right-inclusive by
# default: (0, 50] -> Top-ranked, (50, 200] -> Mid-ranked, (200, inf) -> Low-ranked.
# Players whose average rank is missing or <= 0 get a NaN group.
unique_player_data['Rank_Group'] = pd.cut(unique_player_data['Avg_Rank'], bins=[0, 50, 200, np.inf], labels=['Top-ranked', 'Mid-ranked', 'Low-ranked'])

# Attach each player's ranking group to every long-format row, then average
# the win/loss ratio for each (ranking group, surface) combination.
grouped_data = player_data.merge(unique_player_data[['Player', 'Rank_Group']], on='Player')
# Rank_Group is categorical (it is produced by pd.cut). `observed` is passed
# explicitly because relying on the implicit default for categorical keys is
# deprecated in recent pandas; False keeps the existing output, in which
# unobserved group/surface combinations appear as NaN rows.
grouped_data = (
    grouped_data
    .groupby(['Rank_Group', 'Surface'], observed=False)
    .agg({'Win_Loss_Ratio': 'mean'})
    .reset_index()
)

# Create a bar chart comparing win/loss ratios across ranking groups and surface types
plt.figure(figsize=(12, 6))
sns.barplot(x='Surface', y='Win_Loss_Ratio', hue='Rank_Group', data=grouped_data)
plt.title('Win/Loss Ratios by Surface Type and Ranking Group')
plt.ylabel('Average Win/Loss Ratio')
# savefig does not create missing directories, so make sure the output
# directory exists before writing the figure.
os.makedirs("./ref_result", exist_ok=True)
plt.savefig("./ref_result/barplot.png")
plt.show()

# One-way ANOVA per ranking group: does the mean win/loss ratio differ
# between Hard, Clay and Grass surfaces? Stores the p-value per group, or
# None when any of the three surfaces has no data for that group.

# The player -> ranking-group merge does not depend on the loop variable, so
# do it once here and filter per group inside the loop.
ranked_player_data = player_data.merge(unique_player_data[['Player', 'Rank_Group']], on='Player')

anova_results = {}
for group in ['Top-ranked', 'Mid-ranked', 'Low-ranked']:
    group_data = ranked_player_data[ranked_player_data['Rank_Group'] == group]
    # Win/loss ratios for this group on each surface, with missing values dropped
    hard_data = group_data.loc[group_data['Surface'] == 'Hard', 'Win_Loss_Ratio'].dropna()
    clay_data = group_data.loc[group_data['Surface'] == 'Clay', 'Win_Loss_Ratio'].dropna()
    grass_data = group_data.loc[group_data['Surface'] == 'Grass', 'Win_Loss_Ratio'].dropna()

    # Only run the test when every surface contributes at least one sample
    if len(hard_data) > 0 and len(clay_data) > 0 and len(grass_data) > 0:
        anova_result = stats.f_oneway(hard_data, clay_data, grass_data)
        anova_results[group] = anova_result.pvalue
    else:
        anova_results[group] = None

print(anova_results)
# Use a context manager so the file is flushed and closed deterministically
with open("./ref_result/anova_results.pkl", "wb") as results_file:
    pickle.dump(anova_results, results_file)